In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline
from warnings import filterwarnings 
filterwarnings("ignore")
sns.set_style('whitegrid')
plt.style.use('fivethirtyeight')              
D:\anaconda files\lib\site-packages\scipy\__init__.py:155: UserWarning: A NumPy version >=1.18.5 and <1.25.0 is required for this version of SciPy (detected version 1.26.4
  warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
In [2]:
data=pd.read_csv("C:\\Users\\laxma\\Downloads\\advertising (1).csv")
data
Out[2]:
Daily Time Spent on Site Age Area Income Daily Internet Usage Ad Topic Line City Male Country Timestamp Clicked on Ad
0 68.95 35 61833.90 256.09 Cloned 5thgeneration orchestration Wrightburgh 0 Tunisia 2016-03-27 00:53:11 0
1 80.23 31 68441.85 193.77 Monitored national standardization West Jodi 1 Nauru 2016-04-04 01:39:02 0
2 69.47 26 59785.94 236.50 Organic bottom-line service-desk Davidton 0 San Marino 2016-03-13 20:35:42 0
3 74.15 29 54806.18 245.89 Triple-buffered reciprocal time-frame West Terrifurt 1 Italy 2016-01-10 02:31:19 0
4 68.37 35 73889.99 225.58 Robust logistical utilization South Manuel 0 Iceland 2016-06-03 03:36:18 0
... ... ... ... ... ... ... ... ... ... ...
995 72.97 30 71384.57 208.58 Fundamental modular algorithm Duffystad 1 Lebanon 2016-02-11 21:49:00 1
996 51.30 45 67782.17 134.42 Grass-roots cohesive monitoring New Darlene 1 Bosnia and Herzegovina 2016-04-22 02:07:01 1
997 51.63 51 42415.72 120.37 Expanded intangible solution South Jessica 1 Mongolia 2016-02-01 17:24:57 1
998 55.55 19 41920.79 187.95 Proactive bandwidth-monitored policy West Steven 0 Guatemala 2016-03-24 02:35:54 0
999 45.01 26 29875.80 178.35 Virtual 5thgeneration emulation Ronniemouth 0 Brazil 2016-06-03 21:43:21 1

1000 rows × 10 columns

In [3]:
data.head()
Out[3]:
Daily Time Spent on Site Age Area Income Daily Internet Usage Ad Topic Line City Male Country Timestamp Clicked on Ad
0 68.95 35 61833.90 256.09 Cloned 5thgeneration orchestration Wrightburgh 0 Tunisia 2016-03-27 00:53:11 0
1 80.23 31 68441.85 193.77 Monitored national standardization West Jodi 1 Nauru 2016-04-04 01:39:02 0
2 69.47 26 59785.94 236.50 Organic bottom-line service-desk Davidton 0 San Marino 2016-03-13 20:35:42 0
3 74.15 29 54806.18 245.89 Triple-buffered reciprocal time-frame West Terrifurt 1 Italy 2016-01-10 02:31:19 0
4 68.37 35 73889.99 225.58 Robust logistical utilization South Manuel 0 Iceland 2016-06-03 03:36:18 0
In [4]:
data.tail()
Out[4]:
Daily Time Spent on Site Age Area Income Daily Internet Usage Ad Topic Line City Male Country Timestamp Clicked on Ad
995 72.97 30 71384.57 208.58 Fundamental modular algorithm Duffystad 1 Lebanon 2016-02-11 21:49:00 1
996 51.30 45 67782.17 134.42 Grass-roots cohesive monitoring New Darlene 1 Bosnia and Herzegovina 2016-04-22 02:07:01 1
997 51.63 51 42415.72 120.37 Expanded intangible solution South Jessica 1 Mongolia 2016-02-01 17:24:57 1
998 55.55 19 41920.79 187.95 Proactive bandwidth-monitored policy West Steven 0 Guatemala 2016-03-24 02:35:54 0
999 45.01 26 29875.80 178.35 Virtual 5thgeneration emulation Ronniemouth 0 Brazil 2016-06-03 21:43:21 1
In [5]:
data.describe()
Out[5]:
Daily Time Spent on Site Age Area Income Daily Internet Usage Male Clicked on Ad
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000 1000.00000
mean 65.000200 36.009000 55000.000080 180.000100 0.481000 0.50000
std 15.853615 8.785562 13414.634022 43.902339 0.499889 0.50025
min 32.600000 19.000000 13996.500000 104.780000 0.000000 0.00000
25% 51.360000 29.000000 47031.802500 138.830000 0.000000 0.00000
50% 68.215000 35.000000 57012.300000 183.130000 0.000000 0.50000
75% 78.547500 42.000000 65470.635000 218.792500 1.000000 1.00000
max 91.430000 61.000000 79484.800000 269.960000 1.000000 1.00000
In [6]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Daily Time Spent on Site  1000 non-null   float64
 1   Age                       1000 non-null   int64  
 2   Area Income               1000 non-null   float64
 3   Daily Internet Usage      1000 non-null   float64
 4   Ad Topic Line             1000 non-null   object 
 5   City                      1000 non-null   object 
 6   Male                      1000 non-null   int64  
 7   Country                   1000 non-null   object 
 8   Timestamp                 1000 non-null   object 
 9   Clicked on Ad             1000 non-null   int64  
dtypes: float64(3), int64(3), object(4)
memory usage: 78.2+ KB
In [7]:
data.isnull().sum()
Out[7]:
Daily Time Spent on Site    0
Age                         0
Area Income                 0
Daily Internet Usage        0
Ad Topic Line               0
City                        0
Male                        0
Country                     0
Timestamp                   0
Clicked on Ad               0
dtype: int64
In [8]:
data.duplicated().sum()
Out[8]:
0
In [9]:
#VISUALIZATION
In [10]:
plt.bar(data['Age'],data['Area Income'])
plt.xticks(rotation=90)
plt.show()
In [11]:
fig=px.bar(data,x='Country',y='Male',color='Country')
fig.show()
In [12]:
fig=px.violin(data,x='Clicked on Ad',y='Daily Internet Usage',color='Clicked on Ad')
fig.show()
In [13]:
fig=px.bar(data,x='Age',y='Ad Topic Line',color='Ad Topic Line')
fig.show()
In [14]:
plt.bar(data['Clicked on Ad'],data['Daily Time Spent on Site'])
plt.scatter(data['Age'],data['Area Income'],color='red')
plt.xticks(rotation=90)
plt.show()
In [15]:
plt.figure(figsize=(10,4))
sns.countplot(x='Clicked on Ad', data=data, color='cyan')
plt.title('Clicked on Ad')
plt.show()
In [17]:
sns.lineplot(x='Clicked on Ad', y='Age', data=data).set_title('Clicked on Ad by Age')
Out[17]:
Text(0.5, 1.0, 'Clicked on Ad by Age')
In [18]:
sns.barplot(data['Male'],data['Age'],color='r')
plt.xticks(rotation=90)
plt.show()
In [19]:
plt.figure(figsize=(8, 4))
sns.scatterplot(data=data, x='Daily Time Spent on Site', y='Age')
plt.title('Daily Time Spent on Site in age')
plt.xlabel('Daily Time Spent on Site')
plt.ylabel('Age')
plt.show()
In [20]:
sns.displot(data["Daily Internet Usage"])
Out[20]:
<seaborn.axisgrid.FacetGrid at 0x147bb1143d0>
In [21]:
sns.relplot(x='Clicked on Ad',y='Area Income',data=data)
Out[21]:
<seaborn.axisgrid.FacetGrid at 0x147bcb99460>
In [22]:
plt.figure(figsize=(8, 6))
data.Age.hist(bins=data.Age.nunique())
plt.xlabel('Age')
Out[22]:
Text(0.5, 0, 'Age')
In [23]:
plt.figure(figsize=(8,6))
sns.jointplot(x=data["Area Income"],y=data.Age)
Out[23]:
<seaborn.axisgrid.JointGrid at 0x147bb2456d0>
<Figure size 800x600 with 0 Axes>
In [24]:
plt.figure(figsize=(8,6))
sns.jointplot(x=data["Daily Time Spent on Site"],y=data.Age, kind='kde')
Out[24]:
<seaborn.axisgrid.JointGrid at 0x147bb245bb0>
<Figure size 800x600 with 0 Axes>
In [25]:
plt.figure(figsize=(8,6))
sns.jointplot(x=data["Daily Time Spent on Site"],y=data["Daily Internet Usage"])
Out[25]:
<seaborn.axisgrid.JointGrid at 0x147bdb9d640>
<Figure size 800x600 with 0 Axes>
In [26]:
sns.pairplot(data)
Out[26]:
<seaborn.axisgrid.PairGrid at 0x147bf277910>
In [27]:
#MODEL BUILDING
In [28]:
data['Clicked on Ad'].value_counts()
Out[28]:
0    500
1    500
Name: Clicked on Ad, dtype: int64
In [29]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
 
def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    if train:
        pred=clf.predict(X_train)
        clf_report = pd.DataFrame(classification_report(y_train, pred, output_dict=True))
        print("Train result:\n ")
        print(f"Accuracy score: {accuracy_score(y_train,pred) * 100:.2f}%")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print(f"confusion matrix:\n{confusion_matrix(y_train,pred)}\n")
            
    elif train==False:
        pred=clf.predict(X_test)
        clf_report = pd.DataFrame(classification_report(y_test, pred, output_dict=True))
        print("Test Result:\n")
        print(f"Accuracy score: {accuracy_score(y_test,pred) * 100:.2f}%")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")
In [30]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split

X = data.drop(['Timestamp', 'Clicked on Ad', 'Ad Topic Line', 'Country', 'City'], axis=1)
y = data['Clicked on Ad']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
 
    
num_columns = ['Daily Time Spent on Site','Age','Area Income','Daily Internet Usage','Male']


ct = make_column_transformer(
     (MinMaxScaler(), num_columns),
     (StandardScaler(), num_columns),
    remainder = 'passthrough'
)

X_train = ct.fit_transform(X_train)
X_test = ct.transform(X_test)
In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr_clf=LogisticRegression(solver='liblinear')
lr_clf.fit(X_train, y_train)

print_score(lr_clf, X_train, y_train, X_test, y_test, train=True)
print_score(lr_clf, X_train, y_train, X_test, y_test, train=False)
Train result:
 
Accuracy score: 97.43%
CLASSIFICATION REPORT:
                    0           1  accuracy   macro avg  weighted avg
precision    0.964088    0.985207  0.974286    0.974648      0.974527
recall       0.985876    0.962428  0.974286    0.974152      0.974286
f1-score     0.974860    0.973684  0.974286    0.974272      0.974279
support    354.000000  346.000000  0.974286  700.000000    700.000000
confusion matrix:
[[349   5]
 [ 13 333]]

Test Result:

Accuracy score: 97.00%
CLASSIFICATION REPORT:
                    0           1  accuracy   macro avg  weighted avg
precision    0.959732    0.980132      0.97    0.969932      0.970204
recall       0.979452    0.961039      0.97    0.970246      0.970000
f1-score     0.969492    0.970492      0.97    0.969992      0.970005
support    146.000000  154.000000      0.97  300.000000    300.000000
Confusion Matrix: 
 [[143   3]
 [  6 148]]